###############################################################################
# Attention Analysis
###############################################################################
# This script contains the code used to analyse the media attention that candidates receive.
###############################################################################
# Content
###############################################################################
# 1) Dependencies
# 2) Load Data
# 3) Generate aggregated hits and documents per candidates for 2015 vs 2019
# 4) Fixed effects models
# 5) Fixed Effects Plots of interest
# 5.1) Gender Effects with Party
# 5.2) Gender Effect
###############################################################################
# 1) Dependencies
###############################################################################
library(readr)
library(dplyr)
library(ggplot2)
library(gganimate)
library(ggeffects)
library(ggExtra)
library(ggridges)
library(ggrepel)
library(grid)
library(scales)
library(lubridate)
library(extrafont)
library(reshape2)
library(here)
library(ggforce)
library(png)
library(readxl)
library(grid)
library(gridExtra)
library(ggpubr)
library(sjPlot)
library(ggplot2)
library(lme4)
library(effects)
library(texreg)
library(MASS)
###############################################################################
# 2) Load Data
###############################################################################
# Set Path
# All relative paths used below ("../data", "../support", the theme file) are
# resolved against the directory of this script. The RStudio API can only
# report that directory from inside a running RStudio session with a saved
# document, so the call is guarded: otherwise the current working directory is
# kept and must already be the script directory.
script_path <- ""
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
}
if (nzchar(script_path)) {
  setwd(dirname(script_path))
} else {
  message(
    "Script path not available via the RStudio API; ",
    "keeping working directory: ", getwd()
  )
}
# NOTE(review): clears every object in the calling environment (including
# script_path above). Kept for backward compatibility; consider running the
# script in a fresh session instead.
rm(list = ls())

# Options and Seed
options(stringsAsFactors = FALSE)
# R parses the literal 0213 as the decimal number 213.
set.seed(0213)

# Custom functions
# Discrete x scale whose labels have everything from `sep` onwards removed,
# i.e. the "<sep><suffix>" part appended by reorder_within() is hidden.
#   ...  further arguments passed on to ggplot2::scale_x_discrete()
#   sep  separator (used as a regular expression) that starts the suffix
scale_x_reordered <- function(..., sep = "___") {
  suffix_pattern <- paste0(sep, ".+$")
  strip_suffix <- function(x) {
    gsub(suffix_pattern, "", x)
  }
  ggplot2::scale_x_discrete(labels = strip_suffix, ...)
}
# Reorder the levels of `x` separately for every value of `within`.
# Each element of `x` is pasted together with its `within` value (joined by
# `sep`), and the resulting labels are ordered by `fun` applied to `by`.
#   x       values to reorder
#   by      values the ordering is based on
#   within  grouping (e.g. facet) each element of `x` belongs to
#   fun     summary function handed to stats::reorder() as FUN
#   sep     separator placed between `x` and `within`
#   ...     accepted for compatibility; not used by this implementation
# Returns the factor produced by stats::reorder().
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  tagged_labels <- do.call(paste, list(x, within, sep = sep))
  stats::reorder(tagged_labels, by, FUN = fun)
}

# Source Theme for Figures
source("ggplot_theme_ddl.R", encoding = "UTF-8")

# Load Data
# Combined 2015/2019 data set; the printed values of `council` are only an
# interactive check.
df <- readRDS("../data/smd_ner_2015_2019_combined.RDS")
unique(df$council)

# Candidate lists of both elections; `id` is converted to character.
candidates_list_15 <- read.csv(
  "../support/candidates-2015/00-Named_Entity_List_withID.csv",
  stringsAsFactors = FALSE
) %>%
  as_tibble() %>%
  mutate(id = as.character(id))

candidates_list_19 <- read.csv(
  "../support/candidates-2019/00-Named_Entity_List_withID.csv",
  stringsAsFactors = FALSE
) %>%
  as_tibble() %>%
  mutate(id = as.character(id))

# Store the year as character and normalise `date` to a "month-day" string.
df[["year"]] <- as.character(df[["year"]])
df[["date"]] <- format(as.Date(df[["date"]], "%m-%d"), format = "%m-%d")

# Harmonise the spelling of one candidate name.
df[["fullname"]] <- ifelse(
  df[["fullname"]] == "Adèle Goumaz",
  "Adèle Thorens Goumaz",
  df[["fullname"]]
)

#-----------------------------------------------------------------------------#
# Configurations
#-----------------------------------------------------------------------------#
# Switch: drop Federal Councillors from the analysis data
council <- TRUE
# Switch: drop party presidents from the analysis data
president <- FALSE

# Topic classes that are not of interest
unwanted_topics <- c(
  "PoliticalSystem",
  "Other_unclassified_Political_Texts",
  "NotPolitical",
  "Not Classified",
  "Other_Problems"
)

# Federal Council members, 2015
council_15 <- c(
  "Ueli Maurer",
  "Alain Berset",
  "Didier Burkhalter",
  "Simonetta Sommaruga",
  "Eveline Widmer Schlumpf",
  "Johann Schneider-Ammann",
  "Doris Leuthard"
)

# Federal Council members, 2019
council_19 <- c(
  "Ueli Maurer",
  "Alain Berset",
  "Ignazio Cassis",
  "Simonetta Sommaruga",
  "Guy Parmelin",
  "Karin Keller-Sutter",
  "Viola Amherd"
)

# Party presidents, 2015
presi_15 <- c(
  "Toni Brunner",
  "Christian Levrat",
  "Philipp Müller",
  "Christophe Darbellay",
  "Regula Rytz",
  "Martin Bäumle",
  "Martin Landolt"
)

# Party presidents, 2019
presi_19 <- c(
  "Albert Rösti",
  "Christian Levrat",
  "Petra Gössi",
  "Gerhard Pfister",
  "Regula Rytz",
  "Jürg Grossen",
  "Martin Landolt"
)


# Colour and shape scales
values_gender_1 <- c("f" = "#DD2461", "m" = "#7D7D7C")
shapes_gender_1 <- c("f" = 1, "m" = 5)

# Party colours come from `colourList` (expected to be defined by the sourced
# theme file); names are upper-cased and a grey "Rest" entry is stored at
# position 22.
# NOTE(review): the hard-coded 21/22 assumes exactly 21 party colours - confirm.
.fill2 <- unlist(colourList[["colour"]][["parties"]])
names(.fill2) <- toupper(names(.fill2))
.fill2[22] <- "grey"
names(.fill2) <- c(names(.fill2)[1:21], "Rest")

values_year_2 <- c("2015" = "#DD2461", "2019" = "#7D7D7C")
values_gender_2 <- c("Women" = "#DD2461", "Men" = "#7D7D7C")


# Restrict the data to national and larger regional outlets.
# The trailing comments give the outlet category and language.
regional_national_newspapers <- c(
  "20 minuten", # national (DE)
  "20 minutes", # national (FR)
  "20 minuti", # national (IT)
  "24 heures", # national (FR)
  "srf.ch", # national (DE)
  "rts.ch", # national (FR)
  "Aargauer Zeitung / MLZ", # big regional (DE)
  "Arcinfo", # medium regional (FR)
  "Basler Zeitung", # big regional (DE)
  "Berner Zeitung", # big regional (DE)
  "Blick", # national (DE)
  "Bote der Urschweiz", # medium regional (DE)
  "Bündner Tagblatt", # medium regional (DE)
  "Der Bund", # national (DE)
  "Der Landbote", # medium regional (DE)
  "Die Weltwoche", # national (DE)
  "Die Wochenzeitung", # national (DE)
  "Finanz und Wirtschaft", # national (DE)
  "La Liberté", # medium regional (FR)
  "Le Matin", # national (FR)
  "Le Matin Dimanche", # national (FR)
  "Le Temps", # national (FR)
  "Luzerner Zeitung", # big regional (DE)
  "Neue Luzerner Zeitung", # big regional (DE)
  "Neue Zürcher Zeitung", # national (DE)
  "NZZ am Sonntag", # national (DE)
  "Ostschweiz am Sonntag", # national (DE)
  "Schweiz am Sonntag / MLZ", # national (DE)
  "Sonntagsblick", # national (DE)
  "SonntagsZeitung", # national (DE)
  "St. Galler Tagblatt", # big regional (DE)
  "Südostschweiz", # medium regional (DE)
  "swissinfo.ch", # national (DE)
  "Tages-Anzeiger", # national (DE)
  "Tribune de Genève", # big regional (FR)
  "watson.ch", # national (DE)
  "www.20minutes.ch", # national (FR)
  "www.lematin.ch" # national (FR)
)

# Keep only rows whose source (`so_txt`) is on the list, then print the
# remaining sources as a check.
df <- dplyr::filter(df, so_txt %in% regional_national_newspapers)
sort(unique(df$so_txt))

###############################################################################
# 3) Generate aggregated hits and documents per candidates for 2015 vs 2019
###############################################################################
# Number of rows in `df` per candidate, election year and topic class.
df_hits <- df %>%
  group_by(
    year, fullname, person.id, gender, party, canton,
    list_place_1, age, incumbent, selectsclass
  ) %>%
  summarise(n.hits = n())

# Add Candidates with zero mentions via the lists of candidates:
names(df_hits)

# Keep only those listed candidates that do not occur in the hit data of the
# respective year.
helper <- df_hits %>% filter(year == 2015)
candidates_list_15 <- filter(candidates_list_15, !(id %in% helper$person.id))

helper <- df_hits %>% filter(year == 2019)
candidates_list_19 <- filter(candidates_list_19, !(id %in% helper$person.id))

# Bring the 2019 council variable to the same standard: whitespace is
# replaced by a plain blank before the values are mapped to "sr" / "nr" /
# "sr & nr".
# NOTE(review): the repeated literals are reproduced as found; one of each
# pair may originally have differed in a whitespace character - confirm.
candidates_list_19 <- candidates_list_19 %>%
  mutate(
    candidacy = as.character(gsub("\\s", " ", council)),
    council = case_when(
      candidacy %in% c("SR", "Former Staenderat", "Former Staenderat") ~ "sr",
      candidacy %in% c("NR", "Former Nationalrat", "Former Nationalrat") ~ "nr",
      candidacy %in% c("SR und NR", "NR und SR") ~ "sr & nr"
    )
  ) %>%
  dplyr::select(-candidacy)

# Shape the zero-mention candidates like `df_hits`: zero hits, no topic class.
candidates_list_15 <- candidates_list_15 %>%
  dplyr::transmute(
    age, fullname, gender, incumbent, list_place_1, party,
    person.id = id,
    canton = district,
    n.hits = 0,
    selectsclass = NA,
    year = "2015"
  )

candidates_list_19 <- candidates_list_19 %>%
  dplyr::transmute(
    age, fullname, gender, incumbent, list_place_1, party,
    person.id = id,
    canton = district,
    n.hits = 0,
    selectsclass = NA,
    year = "2019"
  )

df_hits <- dplyr::bind_rows(df_hits, candidates_list_15, candidates_list_19)

# Complete Selectsclass for each Person in each Year: every candidate gets one
# row per topic class observed in `df_hits`, with n.hits = 0 where the
# combination was missing.
df_hits_15 <- df_hits %>%
  mutate(year = as.numeric(year)) %>%
  filter(year == 2015) %>%
  group_by(
    fullname, person.id, gender, party, canton, list_place_1, age, incumbent
  ) %>%
  tidyr::complete(
    selectsclass = unique(df_hits$selectsclass),
    fill = list(n.hits = 0, year = 2015)
  )

df_hits_19 <- df_hits %>%
  mutate(year = as.numeric(year)) %>%
  filter(year == 2019) %>%
  group_by(
    fullname, person.id, gender, party, canton, list_place_1, age, incumbent
  ) %>%
  tidyr::complete(
    selectsclass = unique(df_hits$selectsclass),
    fill = list(n.hits = 0, year = 2019)
  )

# Sanity Check: number of distinct candidates per year
length(unique(df_hits_19$person.id))
length(unique(df_hits_15$person.id))

# Combine the completed dfs
df_hits <- dplyr::bind_rows(df_hits_15, df_hits_19)

# Remove rows without a topic class or without a person id
df_hits <- df_hits %>%
  filter(!is.na(selectsclass), !is.na(person.id))

# Sanity Check: rows per year and topic class
sanity <- df_hits %>%
  group_by(year, selectsclass) %>%
  summarise(n = n())
sanity

# Initialize files:
df_hits_f <- df_hits
df_time <- df

# Remove federal councilors (rows of any other year are dropped as well,
# because only the 2015 and 2019 conditions can be TRUE)
if (council == TRUE) {
  df_hits_f <- df_hits_f %>%
    dplyr::filter(
      (year == "2015" & !fullname %in% council_15) |
        (year == "2019" & !fullname %in% council_19)
    )
  df_time <- df_time %>%
    dplyr::filter(
      (year == "2015" & !fullname %in% council_15) |
        (year == "2019" & !fullname %in% council_19)
    )
}

# Remove Party Presidents (do not do this use a control variable)
if (president == TRUE) {
  df_hits_f <- df_hits_f %>%
    dplyr::filter(
      (year == "2015" & !fullname %in% presi_15) |
        (year == "2019" & !fullname %in% presi_19)
    )
  df_time <- df_time %>%
    dplyr::filter(
      (year == "2015" & !fullname %in% presi_15) |
        (year == "2019" & !fullname %in% presi_19)
    )
}

###############################################################################
# 4) Analysis and Plot Loop
###############################################################################
#-----------------------------------------------------------------------------#
# Setup (With Presidents (Topics / All) / Without Presidents  (Topics / All))
#-----------------------------------------------------------------------------#

# Each configuration is c(<topic split prefix>, <president handling>):
# "split_" analyses topics separately, "" pools them; "presidents" keeps party
# presidents, "no_presidents" removes them.
Split_No_President <- c("split_", "no_presidents")
No_President <- c("", "no_presidents")
Split_President <- c("split_", "presidents")
President <- c("", "presidents")

configs <- list(
  Split_No_President,
  No_President,
  Split_President,
  President
)
#-----------------------------------------------------------------------------#
# Add Week to Data:
#-----------------------------------------------------------------------------#
# Parse the publication date and derive its week number and year.
df_time <- df_time %>%
  mutate(
    pubDateTime = ymd(pubDateTime),
    weekd = week(pubDateTime),
    yeard = year(pubDateTime)
  )

# For every (year, week) keep the row with the earliest publication date and
# use its `date` ("month-day") as the first day of that week.
first_weekday <- df_time %>%
  dplyr::group_by(yeard, weekd) %>%
  dplyr::arrange(pubDateTime) %>%
  dplyr::filter(row_number() == 1) %>%
  dplyr::select(yeard, weekd, firstday = date) %>%
  mutate(firstday = as.Date(firstday, format = "%m-%d"))


# Join the first day of the week back onto every row (joined on the columns
# the two tables have in common).
df_time <- df_time %>% left_join(first_weekday)

# as.Date() above was given no year, so rebuild `firstday` from its month and
# day together with the publication year `yeard`.
df_time <- df_time %>%
  mutate(
    month = month(firstday),
    day = day(firstday),
    firstday = as.Date(paste(yeard, month, day, sep = "-"))
  )

#-----------------------------------------------------------------------------#
# Loop
#-----------------------------------------------------------------------------#

# Runs the data preparation once per configuration in `configs` and writes the
# daily gender shares to CSV files under "../data".
#
# Fixes compared with the previous version:
#   * The plot tables are built from `df_time_loop`, i.e. the data that
#     respects the president setting of the configuration. Before, the
#     unfiltered `df_time` was used, so the "no_presidents" files were
#     identical to the "presidents" files.
#   * `seq_along()` replaces `1:length()`.
#   * The positional topic recoding now stops with an error when the number of
#     topic classes does not match the number of labels, instead of silently
#     assigning labels to the wrong classes.
for (m in seq_along(configs)) {
  split_topics <- configs[[m]][1] == "split_"
  president_tag <- configs[[m]][2]

  #---------------------------------#
  # Data Preparation
  #---------------------------------#

  ## with or without party presidents
  if (president_tag == "presidents") {
    df_hits_loop <- df_hits_f
    df_time_loop <- df_time
  } else {
    df_hits_loop <- df_hits_f %>%
      dplyr::filter(
        (year == "2015" & !fullname %in% presi_15) |
          (year == "2019" & !fullname %in% presi_19)
      )

    df_time_loop <- df_time %>%
      dplyr::filter(
        (year == "2015" & !fullname %in% presi_15) |
          (year == "2019" & !fullname %in% presi_19)
      )
  }

  ## candidate-level hits: pooled over topics unless topics are split
  if (!split_topics) {
    df_hits_loop <- df_hits_loop %>%
      dplyr::group_by(
        year, fullname, gender, party, canton, list_place_1, age, incumbent
      ) %>%
      dplyr::summarise(n.hits = sum(n.hits))
  }

  # Factorize some variables; list_place is 1 for list places 1 to 3, else 0
  df_hits_loop$party <- as.factor(df_hits_loop$party)
  df_hits_loop$canton <- as.factor(df_hits_loop$canton)
  df_hits_loop$list_place <- 0
  df_hits_loop$list_place[df_hits_loop$list_place_1 %in% c(1, 2, 3)] <- 1
  df_hits_loop$age_sq <- df_hits_loop$age * df_hits_loop$age
  df_hits_loop$incumbent <- as.factor(df_hits_loop$incumbent)
  df_hits_loop$gender <- as.factor(df_hits_loop$gender)
  if (split_topics) {
    df_hits_loop$selectsclass <- as.factor(df_hits_loop$selectsclass)
  }

  ## with split topics or without
  if (split_topics) {
    # Share of hits per gender, by year, day and topic class
    plot_df_1 <- df_time_loop %>%
      filter(gender %in% c("f", "m")) %>%
      group_by(year, date, selectsclass, gender) %>%
      summarise(n = n()) %>%
      ungroup() %>%
      group_by(year, date, selectsclass) %>%
      mutate(
        sum_art_class = sum(n),
        freq = n / sum(n),
        perc = (n / sum(n)) * 100
      )

    plot_df_1$date <- as.Date(plot_df_1$date, "%m-%d")

    # Documents per gender (n_2) relative to all hits of the day and topic
    # class (sum_day, which includes rows with other or missing gender)
    plot_df_2 <- df_time_loop %>%
      group_by(year, date, selectsclass, gender, doc.id) %>%
      summarise(n = n()) %>%
      ungroup() %>%
      group_by(year, date, selectsclass) %>%
      mutate(sum_day = sum(n)) %>%
      ungroup() %>%
      group_by(year, date, selectsclass, gender) %>%
      mutate(n_2 = n()) %>%
      ungroup() %>%
      group_by(year, date, selectsclass) %>%
      mutate(
        freq = n_2 / sum_day,
        perc = (n_2 / sum_day) * 100
      ) %>%
      dplyr::select(-c(doc.id, n)) %>%
      distinct(.keep_all = TRUE) %>%
      filter(gender %in% c("f", "m"))

    plot_df_2$date <- as.Date(plot_df_2$date, "%m-%d")

    # Topic Names re-code and filter
    excluded_topics <- c(unwanted_topics, "Regions_NationalCohesion")
    plot_df_1 <- plot_df_1 %>%
      dplyr::filter(!selectsclass %in% excluded_topics)
    plot_df_2 <- plot_df_2 %>%
      dplyr::filter(!selectsclass %in% excluded_topics)

    # Display labels for the topic classes. They are assigned by position to
    # the alphabetically sorted class names, so both must have equal length.
    topic_labels <- list(
      "Agriculture", "Economy", "Education & Culture", "Environment",
      "Europe", "Finance & Taxes", "Gender", "Immigration",
      "International Relations", "Labour Market", "Law & Order",
      "Public Health", "Services & Infrastructure", "Social Security"
    )

    # recode the classes of plot_df_1 to the display labels
    vec <- sort(unique(plot_df_1$selectsclass))
    if (length(vec) != length(topic_labels)) {
      stop(
        "Expected ", length(topic_labels), " topic classes in plot_df_1, ",
        "found ", length(vec), "; labels would be misassigned.",
        call. = FALSE
      )
    }
    .recodr <- setNames(topic_labels, vec)
    .recodrnobreaks <- .recodr

    plot_df_1 <- plot_df_1 %>%
      mutate(
        selectsclass = dplyr::recode(selectsclass, !!!.recodr),
        selectsclass = factor(selectsclass)
      )

    # recode the classes of plot_df_2 to the display labels
    vec <- sort(unique(plot_df_2$selectsclass))
    if (length(vec) != length(topic_labels)) {
      stop(
        "Expected ", length(topic_labels), " topic classes in plot_df_2, ",
        "found ", length(vec), "; labels would be misassigned.",
        call. = FALSE
      )
    }
    .recodr <- setNames(topic_labels, vec)
    .recodrnobreaks <- .recodr

    plot_df_2 <- plot_df_2 %>%
      mutate(
        selectsclass = dplyr::recode(selectsclass, !!!.recodr),
        selectsclass = factor(selectsclass)
      )

    topic_t <- unique(plot_df_1$selectsclass)
    topic_t2 <- unique(plot_df_2$selectsclass)

    write_csv(
      plot_df_1,
      paste0(
        "../data/big_newspapers_only_diff_in_diff_m_vs_f_topic_split_",
        president_tag, ".csv"
      )
    )
    write_csv(
      plot_df_2,
      paste0(
        "../data/big_newspapers_only_diff_in_diff_m_vs_f_vs_NA_topic_split_",
        president_tag, ".csv"
      )
    )
  } else {
    # Share of hits per gender, by year and day
    plot_df_1 <- df_time_loop %>%
      filter(gender %in% c("f", "m")) %>%
      group_by(year, date, gender) %>%
      summarise(n = n()) %>%
      ungroup() %>%
      group_by(year, date) %>%
      mutate(
        sum_art = sum(n),
        freq = n / sum(n),
        perc = (n / sum(n)) * 100
      )

    plot_df_1$date <- as.Date(plot_df_1$date, "%m-%d")

    topic_t <- "All Together"

    write_csv(
      plot_df_1,
      paste0(
        "../data/big_newspapers_only_diff_in_diff_m_vs_f_no_split_",
        president_tag, ".csv"
      )
    )

    # Documents per gender (n_2) relative to all hits of the day (sum_day,
    # which includes rows with other or missing gender)
    plot_df_2 <- df_time_loop %>%
      group_by(year, date, gender, doc.id) %>%
      summarise(n = n()) %>%
      ungroup() %>%
      group_by(year, date) %>%
      mutate(sum_day = sum(n)) %>%
      ungroup() %>%
      group_by(year, date, gender) %>%
      mutate(n_2 = n()) %>%
      ungroup() %>%
      group_by(year, date) %>%
      mutate(
        freq = n_2 / sum_day,
        perc = (n_2 / sum_day) * 100
      ) %>%
      dplyr::select(-c(doc.id, n)) %>%
      distinct(.keep_all = TRUE) %>%
      filter(gender %in% c("f", "m"))

    plot_df_2$date <- as.Date(plot_df_2$date, "%m-%d")

    topic_t2 <- "All Together"

    write_csv(
      plot_df_2,
      paste0(
        "../data/big_newspapers_only_diff_in_diff_m_vs_f_vs_NA_no_split_absolute_",
        president_tag, ".csv"
      )
    )
  }
}
